# coding:utf-8

from scrapy.contrib.spiders import CrawlSpider
from ..items import IfengItem
import scrapy
import bs4


class IfangSpider(CrawlSpider):
    """Spider for Ifeng (凤凰网) real-estate listings.

    Flow:
      parse()             -> one search request per city
      parse_area_estate() -> one listing-search request per district
      parse_city_estate() -> yields IfengItem per listing, follows pagination
    """
    name = 'ifengspider'
    allowed_domains = ['house.ifeng.com']
    start_urls = ['http://house.ifeng.com/']

    def parse(self, response):
        """Yield one search request per city.

        The city list is a hard-coded HTML snippet (a copy of the site's
        city-picker widget) rather than parsed from *response*, so the
        spider keeps working even if the landing page layout changes.
        """
        data = '''<ul id="city_hot1" class="city-con-list tc_con" ><li>
        <a href="javascript:;" siteid="3066" hid="1" sitedomain="">北京</a></li><li>
        <a href="javascript:;" siteid="27504" hid="2" sitedomain="sh">上海</a></li>
        <li><a href="javascript:;" siteid="17632" hid="4" sitedomain="gz">广州</a></li>
        <li><a href="javascript:;" siteid="36688" hid="3" sitedomain="sz">深圳</a></li>
        <li><a href="javascript:;" siteid="27921" hid="5" sitedomain="tj">天津</a></li>
        <li><a href="javascript:;" siteid="17649" hid="13" sitedomain="hn">海南</a></li>
        <li><a href="javascript:;" siteid="24718" hid="30" sitedomain="km">昆明</a></li>
        <li><a href="javascript:;" siteid="23448" hid="24" sitedomain="xa">西安</a></li>
        <li><a href="javascript:;" siteid="23984" hid="15" sitedomain="zz">郑州</a></li>
        <li><a href="javascript:;" siteid="24431" hid="8" sitedomain="wuhan">武汉</a></li>
        <li><a href="javascript:;" siteid="24396" hid="9" sitedomain="changsha">长沙</a></li>
        <li><a href="javascript:;" siteid="21054" hid="20" sitedomain="jn">济南</a></li>
        <li><a href="javascript:;" siteid="21057" hid="12" sitedomain="cd">成都</a></li>
        <li><a href="javascript:;" siteid="50007" hid="999" sitedomain="cq">重庆</a></li>
        <li><a href="javascript:;" siteid="18691" hid="7" sitedomain="sy">沈阳</a></li>
        <li><a href="javascript:;" siteid="50008" hid="999" sitedomain="hf">合肥</a></li>
        <li><a href="javascript:;" siteid="19003" hid="31" sitedomain="nn">南宁</a></li>
        <li><a href="javascript:;" siteid="18693" hid="11" sitedomain="nj">南京</a></li>
        <li><a href="javascript:;" siteid="21056" hid="28" sitedomain="heb">哈尔滨</a></li>
        <li><a href="javascript:;" siteid="20447" hid="32" sitedomain="cc">长春</a></li>
        <li><a href="javascript:;" siteid="22632" hid="10" sitedomain="hz">杭州</a></li>
        <li><a href="javascript:;" siteid="21055" hid="14" sitedomain="qd">青岛</a></li>
        <li><a href="javascript:;" siteid="20778" hid="16" sitedomain="dl">大连</a></li>
        <li><a href="javascript:;" siteid="50029" hid="999" sitedomain="lanzhou">兰州</a></li>
        <li><a href="javascript:;" siteid="50016" hid="999" sitedomain="zhuhai">珠海</a></li>
        <li><a href="javascript:;" siteid="23064" hid="999" sitedomain="zibo">淄博</a></li>
        <li><a href="javascript:;" siteid="23242" hid="999" sitedomain="weifang">潍坊</a></li>
        <li><a href="javascript:;" siteid="50013" hid="999" sitedomain="nanchang">南昌</a></li>
        <li><a href="javascript:;" siteid="50033" hid="999" sitedomain="wuxi">无锡</a></li>
        <li><a href="javascript:;" siteid="50027" hid="999" sitedomain="foshan">佛山</a></li>
        <li><a href="javascript:;" siteid="50018" hid="999" sitedomain="zhongshan">中山</a></li>
        <li><a href="javascript:;" siteid="50014" hid="999" sitedomain="st">汕头</a></li>
        <li><a href="javascript:;" siteid="50015" hid="999" sitedomain="wuhu">芜湖</a></li>
        <li><a href="javascript:;" siteid="50017" hid="999" sitedomain="fuyang">阜阳</a></li>
        <li><a href="javascript:;" siteid="50020" hid="999" sitedomain="liuzhou">柳州</a></li>
        <li><a href="javascript:;" siteid="50019" hid="999" sitedomain="shaoguan">韶关</a></li>
        <li><a href="javascript:;" siteid="50025" hid="999" sitedomain="enshi">恩施</a></li>
        <li><a href="javascript:;" siteid="50026" hid="999" sitedomain="dali">大理</a></li>
        <li><a href="javascript:;" siteid="50030" hid="999" sitedomain="xiangyang">襄阳</a></li>
        <li><a href="javascript:;" siteid="50032" hid="999" sitedomain="yichang">宜昌</a></li>
        <li><a href="javascript:;" siteid="50034" hid="999" sitedomain="luzhou">泸州</a></li>
        </ul> '''
        soup = bs4.BeautifulSoup(data, 'lxml')

        # One <li>/<a> pair per city in the snippet.
        for child in soup.find_all('li'):
            city = child.a.get_text()
            city_id = child.a.get('sitedomain')
            # BUGFIX: Beijing has an empty sitedomain and lives on the bare
            # house.ifeng.com host rather than a subdomain.  Apply the
            # special case BEFORE building/printing the URL — the original
            # first printed a malformed 'http://.house.ifeng.com/search'.
            if city == '北京':
                city_id = 'house'
                city_url = 'http://house.ifeng.com/search'
            else:
                city_url = 'http://' + city_id + '.house.ifeng.com/search'
            print(city, city_url)
            citys = {
                'website': '凤凰', 'web_url': 'house.ifeng.com',
                'city': city, 'city_id': city_id, 'city_url': city_url
            }
            yield scrapy.Request(city_url, callback=self.parse_area_estate, meta=citys)

    def parse_area_estate(self, response):
        """Yield one listing-search request per district of a city."""
        meta = response.meta
        soup = bs4.BeautifulSoup(response.body, 'lxml')

        # District tabs live inside the '.suo_nav' navigation bar.
        areabox = soup.select('.suo_nav')[0]
        for child in areabox.find_all('li'):
            area = child.a.get_text().strip()
            # Fold the two "miscellaneous" buckets into a single label.
            if area == '周边地区' or area == '异地':
                area = '其他'
            meta['area'] = area

            # Skip the aggregate "all districts" tab — its listings are
            # covered by the individual district pages.
            if area != '全部':
                area_url = child.a.get('href')
                meta['area_url'] = area_url
                # NOTE: scrapy.Request copies meta on construction, so
                # mutating this shared dict per iteration is safe here.
                yield scrapy.Request(area_url, callback=self.parse_city_estate, meta=meta)

    def parse_city_estate(self, response):
        """Emit one IfengItem per listing, then follow pagination.

        Page 0 is the bare district URL; subsequent pages carry a
        '?page=N' suffix appended to the same URL.
        """
        meta = response.meta
        area_url = meta['area_url']
        areaid = area_url.split('/')[-1]
        last_segment = response.url.split('/')[-1]
        now_page = 0
        if last_segment != areaid:
            # Recover N from '<areaid>?page=N'.
            now_page = int(last_segment.replace(areaid + '?page=', ''))

        soup = bs4.BeautifulSoup(response.body, 'html5lib')

        # One anchor per listing in the search-result list.
        for child in soup.select('.search_list_title > a'):
            estate_url = child.get('href')
            item = IfengItem()
            item['website'] = meta['website']
            item['web_url'] = meta['web_url']
            item['city'] = meta['city']
            item['city_id'] = meta['city_id']
            item['area'] = meta['area']
            item['estate'] = child.get_text().strip()
            item['estate_id'] = estate_url.split('/')[-1]
            item['estate_url'] = estate_url
            yield item

        # Pagination: '#total' holds the overall listing count and each
        # page shows 30 listings, so page_num = ceil(total / 30).
        total_num = int(soup.select('#total')[0].get_text().strip())
        # BUGFIX: the original used true division ('total / 30'), which
        # yields a float page count under Python 3; integer ceiling
        # division gives the same value with the correct type.
        page_num = (total_num + 29) // 30

        if now_page < page_num:
            if now_page == 0:
                next_url = response.url + '?page=1'
            else:
                next_url = response.url.replace('?page=' + str(now_page), '?page=' + str(now_page + 1))
            yield scrapy.Request(next_url, callback=self.parse_city_estate, meta=meta)